#!/bin/sh
# Start/stop ceph daemons
# chkconfig: 2345 60 80

### BEGIN INIT INFO
# Provides:          ceph
# Default-Start:     2 3 4 5
# Default-Stop:      0 1 6
# Required-Start:    $remote_fs $named $network $time
# Required-Stop:     $remote_fs $named $network $time
# Short-Description: Start Ceph distributed file system daemons at boot time
# Description:       Enable Ceph distributed file system services.
### END INIT INFO

# Pull in the LSB helper functions when the distribution provides them.
# TODO: on FreeBSD/OSX, use equivalent script file
if [ -e /lib/lsb/init-functions ]; then
    . /lib/lsb/init-functions
fi

# Detect systemd: SYSTEMD_RUN holds the path of the systemd-run binary, and is
# cleared when PID 1 is not systemd (or /proc/1/comm cannot be read).
SYSTEMD_RUN=$(command -v systemd-run 2>/dev/null)
grep -qs systemd /proc/1/comm || SYSTEMD_RUN=""

# If we start up as ./init-ceph, assume everything else is in the
# current directory too (developer mode, never through systemd-run).
# Expansions are quoted so that a script path or working directory that
# contains whitespace does not break the test.
if [ "$(dirname "$0")" = "." ] && [ "$PWD" != "/etc/init.d" ]; then
    BINDIR=.
    SBINDIR=.
    LIBDIR=.
    LIBEXECDIR=.
    ETCDIR=.
    SYSTEMD_RUN=""
    ASSUME_DEV=1
else
    BINDIR=/usr/bin
    SBINDIR=/usr/sbin
    LIBDIR=/usr/lib64/ceph
    LIBEXECDIR=$LIBDIR
    ETCDIR=/etc/ceph
    ASSUME_DEV=0
fi

# A build-tree environment (all three CEPH_* variables set) overrides the
# layout chosen above and also forces developer mode.
if [ -n "$CEPH_BIN" ] && [ -n "$CEPH_ROOT" ] && [ -n "$CEPH_BUILD_DIR" ]; then
  BINDIR=$CEPH_BIN
  SBINDIR=$CEPH_ROOT/src
  ETCDIR=$CEPH_BIN
  LIBEXECDIR=$CEPH_ROOT/src
  SYSTEMD_RUN=""
  ASSUME_DEV=1
fi

################################################
#### WRS: Check if a ceph process is hung ####
################################################

# Each Ceph process has a state and a status.
#  - States are used by this script for decisions
#  - Statuses are $UP or $DOWN and represent a process working state as reported by Ceph
# OSD processes can have a status of "up" or "down" based on what Ceph is
# reporting. We ignore the "in" status (we consider it as "Down")
# Ceph processes states:
# 1. STARTUP - a process is started but has not yet joined the cluster
# 2. OPERATIONAL - a process joined the cluster
# 3. HANGED - a process is hung
# 4. STOPPED - a process is offline (/etc/init.d start was not executed)
# Monitoring is done differently based on state
# All timers are in seconds

# FSM states: the values persisted in the per-process state files and
# compared against by the state machine below.
ST_STARTUP="STARTED"
ST_OPER="OPERATIONAL"
ST_HANGED="HANGED"
ST_STOPPED="STOPPED"

# Ceph daemon status values used by the hang check ("up" or "down")
UP="up"
DOWN="down"

# Paths. VOLATILE_PATH is not set in this file; presumably it is provided by
# /usr/bin/tsconfig -- TODO confirm.
source /usr/bin/tsconfig
DATA_PATH=$VOLATILE_PATH/ceph_hang    # folder where we keep state information
LOG_PATH=/var/log/ceph
LOG_FILE=$LOG_PATH/ceph-process-states.log
mkdir -p $DATA_PATH         # make sure folder exists

# Timeouts (all values in seconds)
WAIT_FOR_CMD=10                 # max wait for a response from Ceph
WAIT_FOR_OSD_OPERATIONAL=300    # max wait for OSD to go 'up' when in startup state
WAIT_FOR_OSD_DOWN_CONFIRM=300   # even if OSD is down it may be flapping, wait for a while
WAIT_FOR_MON_OPERATIONAL=60     # max wait for MON to go 'up' when in startup state
WAIT_FOR_MON_DOWN_CONFIRM=300   # even if MON is down it may be flapping, wait for a while

# Blocked Ops Detection (values in seconds)
BLOCKED_OPS_DETECTION_ENABLED="false" # Enable/Disable detection
BLOCKED_OPS_START_DETECTION=300      # Wait on OSD startup before handling blocked ops
BLOCKED_OPS_RESTART_THRESH=480       # Restart OSD if blocked reqs are longer than this

# Log verbosity; the alternative value mentioned by the original author is DEBUG.
LOG_LEVEL=NORMAL  # DEBUG

save_proc_state() {
    # Set the state of a process in the state machine and store it in
    # ${DATA_PATH}/.<name>_state.
    #   $1 - process name, e.g. 'osd.0' or 'mon.<hostname>'
    #   $2 - one of the ST_* values; any other value is logged as an error
    #        and replaced by $ST_STARTUP before being written
    local name=$1
    local state=$2

    case "$state" in
        "$ST_STARTUP"|"$ST_OPER"|"$ST_HANGED"|"$ST_STOPPED")
            ;;
        *)
            wlog "$name" "ERROR" "State $state is invalid, resetting to $ST_STARTUP" print_trace
            state=$ST_STARTUP
            ;;
    esac

    # Quoted so that a DATA_PATH containing whitespace is not an "ambiguous redirect"
    echo "$state" > "${DATA_PATH}/.${name}_state"
}

load_proc_state() {
    # Print the persisted state of a process, validating the file content.
    #   $1 - process name, e.g. 'osd.0' or 'mon.<hostname>'
    # Prints $ST_STARTUP when no state file exists. When the stored value is
    # not one of the ST_* constants it is logged, reset to $ST_STARTUP and
    # written back through save_proc_state.
    local name=$1
    local state=$ST_STARTUP
    local state_file="${DATA_PATH}/.${name}_state"

    if [ -f "$state_file" ]; then
        state=$(cat "$state_file")
    fi

    case "$state" in
        "$ST_STARTUP"|"$ST_OPER"|"$ST_HANGED"|"$ST_STOPPED")
            ;;
        *)
            wlog "$name" "ERROR" "State $state is invalid, resetting to $ST_STARTUP" print_trace
            state=$ST_STARTUP
            save_proc_state "$name" "$state"
            ;;
    esac

    echo "$state"
}

save_proc_startup_ok() {
    # Reset to initial state after a process started successfully (i.e. it has a valid pid)
    #   $1 - process name, e.g. 'osd.0' or 'mon.<hostname>'
    local name=$1
    local start_file="${DATA_PATH}/.${name}_start_time"
    local down_file="${DATA_PATH}/.${name}_down_time"

    # Process just started, clear all records, reset state
    rm -f "$start_file" "$down_file"
    save_proc_state "$name" "$ST_STARTUP"

    # Save the time (seconds since the epoch) when the process was started
    wlog "$name" INFO "Process $ST_STARTUP successfully, waiting for it to become $ST_OPER"
    date +%s > "$start_file"
}

save_proc_status() {
    # Record the status of a process ($UP or $DOWN).
    # Note that a process status is different from its state.
    #   $1 - process name
    #   $2 - status; $UP removes the down-time record, any other value
    #        creates it (with the current epoch time) unless it already
    #        exists, so the time of the first 'down' observation is kept
    local name=$1
    local status=$2
    local down_file="${DATA_PATH}/.${name}_down_time"

    if [ "$status" = "$UP" ]; then
        rm -f "$down_file"
    elif [ ! -f "$down_file" ]; then
        date +%s > "$down_file"
    fi
}

load_proc_status() {
    # Print the recorded status of a process: $DOWN when a down-time record
    # exists for it under DATA_PATH, $UP otherwise.
    #   $1 - process name
    local name=$1

    if [ -f "${DATA_PATH}/.${name}_down_time" ]; then
        echo "$DOWN"
    else
        echo "$UP"
    fi
}

get_duration(){
    # Print the number of seconds elapsed since the epoch time stored in a file.
    #   $1 - process name (used for logging only)
    #   $2 - path of the file holding the previously recorded time
    # Prints "-1" when the file is missing, its content is not a non-negative
    # integer, or the recorded time lies in the future.
    local name=$1
    local record=$2  # filename with prev time
    local re='^[0-9]+$'
    local start_time now duration

    # Check that we have a record file (quoted: an empty argument must not
    # fall through to a 'cat' that would read stdin)
    if [ ! -f "$record" ]; then
        wlog "$name" ERROR "Failed to compute duration, time was never stored in $record!" print_trace
        echo "-1"; return
    fi

    # Get and validate time previously saved in file
    start_time=$(cat "$record")
    if ! [[ "$start_time" =~ $re ]] ; then
       wlog "$name" ERROR "Recorded time '$start_time' is not a number!" print_trace
       echo "-1"; return
    fi

    # Compute duration; 10# forces base 10 so a value with leading zeros is
    # not interpreted as octal
    now=$(date +%s)
    duration=$((now - 10#$start_time))
    if [ "$duration" -lt 0 ]; then
        wlog "$name" ERROR "Duration less than 0!" print_trace
        echo "-1"; return
    fi
    echo "$duration"
}

get_proc_run_time() {
    # Print how long (in seconds) a process has been running, based on the
    # start-time record kept under DATA_PATH; prints whatever get_duration
    # prints, including "-1" on error.
    #   $1 - process name
    local name=$1
    local time

    # The record path is quoted so it reaches get_duration as one argument
    time=$(get_duration "$name" "${DATA_PATH}/.${name}_start_time")
    wlog "$name" DEBUG ">>> process running for: ${time}s"
    echo "$time"
}

get_proc_down_time() {
    # Print how long (in seconds) a process has been reported down, based on
    # the down-time record kept under DATA_PATH; prints whatever get_duration
    # prints, including "-1" on error.
    #   $1 - process name
    local name=$1
    local time

    # The record path is quoted so it reaches get_duration as one argument
    time=$(get_duration "$name" "${DATA_PATH}/.${name}_down_time")
    wlog "$name" DEBUG ">>> process down for: ${time}s"
    echo "$time"
}

run_state_machine() {
    # Advance the small per-process state machine by one step.
    # The same logic serves ceph-osd and ceph-mon; only the timeouts differ.
    #   $1 - process name: 'osd.<number>' or 'mon.<hostname>'
    #   $2 - process type: 'osd' or 'mon'
    #   $3 - current daemon status ($UP or $DOWN)
    #   $4 - seconds allowed for a started process to come up
    #   $5 - seconds a process may stay down before it is reported as hung
    # Prints an ST_* value on the transitions and terminal states handled
    # below; on the remaining paths nothing is printed.
    local name=$1
    local type=$2
    local status=$3
    local wait_for_operational=$4
    local wait_for_down_confirm=$5

    local state=$(load_proc_state $name)

    wlog $name "DEBUG" ">>>  state: $state"
    case "$state" in
        "$ST_STARTUP")
            wlog $name "DEBUG" ">>> status: $status"
            if [ "$status" = $UP ]; then
                # process joined the cluster
                save_proc_state $name $ST_OPER
                wlog $name "INFO" "Process is OPERATIONAL"
                echo $ST_OPER; return
            fi
            # still not up: it must come up within the allowed start-up window
            if [ $(get_proc_run_time $name) -gt $wait_for_operational ]; then
                wlog $name "ERROR" "Process failed to go up in ${wait_for_operational}s after start, reporting it as $ST_HANGED!"
                save_proc_state $name $ST_HANGED
                echo $ST_HANGED; return
            fi
            ;;
        "$ST_OPER")
            case "$status" in
                "$DOWN")
                    # remember the first time we saw it down
                    if [ $(load_proc_status $name) = $UP ];then
                        wlog $name "WARN" "Process went down!"
                        save_proc_status $name $DOWN
                    fi
                    # a short outage is tolerated to avoid status flapping;
                    # only a long one is reported as a hang
                    if [ $(get_proc_down_time $name) -gt $wait_for_down_confirm ]; then
                        wlog $name "ERROR" "Process went down for more than ${wait_for_down_confirm}s, reporting it as $ST_HANGED"
                        save_proc_state $name $ST_HANGED
                        echo $ST_HANGED; return
                    fi
                    ;;
                "$UP")
                    # back up after having been recorded as down
                    if [ $(load_proc_status $name) = $DOWN ]; then
                        wlog $name "WARN" "Process went up, flapping status or busy process?"
                        save_proc_status $name $UP
                        return
                    fi
                    ;;
            esac
            ;;
        "$ST_HANGED"|"$ST_STOPPED")
            # nothing to do, resetting from these states is done externally in /etc/ceph/ceph_pmon_wrapper.sh
            echo $state; return
            ;;
    esac
}

CEPH_FAILURE=""
execute_ceph_cmd() {
    # Execute a command under 'timeout $WAIT_FOR_CMD'; if it times out, mark
    # Ceph as failed by setting CEPH_FAILURE="true".
    #
    # Two call forms are accepted:
    #   execute_ceph_cmd <result_var> <name> <cmd>
    #       On success the command output is stored in the variable named
    #       <result_var> and nothing is printed. On failure the variable is
    #       left untouched.
    #   execute_ceph_cmd <name> <cmd>
    #       On success the command output is printed on stdout; on failure an
    #       empty line is printed.
    # <name> is the process name used for logging, <cmd> is a shell command
    # line (it may be a pipeline; it is run through 'eval').
    #
    # Returns 0 on success, 1 when the command timed out, exited non-zero or
    # produced no output.
    local result_var=""
    if [ $# -ge 3 ]; then
        result_var=$1
        shift
    fi
    local name=$1
    local cmd="timeout $WAIT_FOR_CMD $2"
    local out_file="$DATA_PATH/.ceph_cmd_out"
    local errcode output

    # pipefail lets the exit status of 'timeout' survive a trailing '| tail ...'
    set -o pipefail
    eval "$cmd" > "$out_file"
    errcode=$?
    set +o pipefail

    if [ "$errcode" -eq 124 ]; then  # 'timeout' returns 124 when timing out
        wlog "$name" "WARN" "Ceph cluster failed to respond in ${WAIT_FOR_CMD}s when running: $cmd"
        CEPH_FAILURE="true"
        [ -z "$result_var" ] && echo ""
        return 1
    fi

    output=$(cat "$out_file")
    if [ -z "$output" ] || [ "$errcode" -ne 0 ]; then
        wlog "$name" "WARN" "Error executing: $cmd errorcode: $errcode output: $output"
        [ -z "$result_var" ] && echo ""
        return 1
    fi

    if [ -n "$result_var" ]; then
        # printf -v assigns without eval, so the output text is never re-parsed
        printf -v "$result_var" '%s' "$output"
    else
        echo "$output"
    fi
    return 0
}

CEPH_OSD_TREE=""
CEPH_HEALTH_DETAIL=""
is_process_hung() {
    # Print "true" when the state machine reports the process as hung,
    # "false" otherwise (including every case where the check is aborted).
    #   $1 - process name: 'osd.<number>' or 'mon.<hostname>'
    #   $2 - process type: 'osd' or 'mon'
    # Ceph command output is cached in CEPH_HEALTH_DETAIL / CEPH_OSD_TREE so
    # that repeated calls in the same shell do not query Ceph again.
    local name=$1
    local type=$2  # 'osd' or 'mon'
    local status state

    # Abort if we had previous errors with Ceph
    if [ "$CEPH_FAILURE" = "true" ]; then
        wlog "$name" "WARN" "Ceph cluster is marked as failed, aborting hang check"
        echo "false"; return
    fi

    # Cache Ceph Health for later use as calling Ceph takes time.
    # The output is captured from stdout so it cannot leak into this
    # function's own "true"/"false" answer.
    if [ -z "$CEPH_HEALTH_DETAIL" ]; then
        if ! CEPH_HEALTH_DETAIL=$(execute_ceph_cmd "$name" "ceph health detail | tail -n 50"); then
            wlog "$name" "WARN" "Aborting hang check"
            echo "false"; return
        fi
    fi

    case "$type" in
        osd)
            # Ignore health check if OSDs are administratively down
            # Note this can be done with: 'ceph osd set noup; ceph osd down <osd.id>'
            if echo "$CEPH_HEALTH_DETAIL" | grep -q "^noup.*set"; then
                wlog "$name" "WARN" "Ceph 'noup' flag is set, aborting hang check"
                echo "false"; return
            fi

            # Multiple OSD processes may be running, so we only run
            # 'ceph osd tree' once as it takes some time to execute
            if [ -z "$CEPH_OSD_TREE" ]; then
                if ! CEPH_OSD_TREE=$(execute_ceph_cmd "$name" "ceph osd tree"); then
                    wlog "$name" "WARN" "Ceph cmd exec failed, aborting hang check"
                    echo "false"; return
                fi
            fi

            # Get osd status as 'up' or, for any other output, as 'down'.
            # -w/-F match the name as a whole literal word, so 'osd.1' does
            # not pick up the line of 'osd.10'.
            if echo "$CEPH_OSD_TREE" | grep -wF -- "$name" | grep -qw "up"; then
                status=$UP
            else
                status=$DOWN
            fi

            state=$(run_state_machine "$name" "$type" "$status" \
                    "$WAIT_FOR_OSD_OPERATIONAL" "$WAIT_FOR_OSD_DOWN_CONFIRM")
            ;;

        mon)
            # A monitor is 'down' when health detail has a line that starts
            # with its name and mentions 'down'
            status=$UP
            if echo "$CEPH_HEALTH_DETAIL" | grep -q -e "^$name.*down"; then
                status=$DOWN
            fi

            state=$(run_state_machine "$name" "$type" "$status" \
                    "$WAIT_FOR_MON_OPERATIONAL" "$WAIT_FOR_MON_DOWN_CONFIRM")
            ;;

        *)
            wlog "$name" "WARN" "Unknown process type: $type"
            echo "false"; return
            ;;
    esac

    if [ "$state" = "$ST_HANGED" ]; then
        echo "true"
    else
        echo "false"
    fi
}


osd_has_blocked_ops() {
    # Print "true" when Ceph reports operations blocked on the given OSD for
    # longer than $BLOCKED_OPS_RESTART_THRESH seconds, "false" otherwise
    # (including every case where the check is aborted or the OSD is not up).
    #   $1 - OSD name, e.g. 'osd.1'
    local name=$1
    local blocked_time

    # Abort if we had previous errors with Ceph
    if [ "$CEPH_FAILURE" = "true" ]; then
        wlog "$name" "WARN" "Ceph cluster is marked as failed, aborting blocked ops check"
        echo "false"; return
    fi

    # Cache Ceph Health for later use as calling Ceph takes time. This is
    # initially cached from the hang check but check and call again here if
    # needed. The output is captured from stdout so it cannot leak into this
    # function's own "true"/"false" answer.
    if [ -z "$CEPH_HEALTH_DETAIL" ]; then
        if ! CEPH_HEALTH_DETAIL=$(execute_ceph_cmd "$name" "ceph health detail | tail -n 50"); then
            wlog "$name" "WARN" "Aborting blocked ops check"
            echo "false"; return
        fi
    fi

    # Ignore health check if OSDs are administratively down
    # Note this can be done with: 'ceph osd set noup; ceph osd down <osd.id>'
    if echo "$CEPH_HEALTH_DETAIL" | grep -q "^noup.*set"; then
        wlog "$name" "WARN" "Ceph 'noup' flag is set, aborting blocked ops check"
        echo "false"; return
    fi

    # Multiple OSD processes may be running, so we only run 'ceph osd tree' once
    # as it takes some time to execute. This is initially cached from the hang
    # check but check and call again here if needed
    if [ -z "$CEPH_OSD_TREE" ]; then
        if ! CEPH_OSD_TREE=$(execute_ceph_cmd "$name" "ceph osd tree"); then
            wlog "$name" "WARN" "Ceph cmd exec failed, aborting blocked ops check"
            echo "false"; return
        fi
    fi

    # Only an OSD that is 'up' is checked. -w/-F match the name as a whole
    # literal word, so 'osd.1' does not pick up the line of 'osd.10'.
    if ! echo "$CEPH_OSD_TREE" | grep -wF -- "$name" | grep -qw "up"; then
        echo "false"; return
    fi

    # Look for and parse:'1 ops are blocked > 1048.58 sec on osd.1'
    # Several such lines may exist for one OSD; keep the largest integer part.
    blocked_time=$(echo "$CEPH_HEALTH_DETAIL" | grep -wF -- "$name" \
        | sed -rn 's/.*ops are blocked > ([[:digit:]]+).*/\1/p' \
        | sort -n | tail -n 1)

    # No matching line means no blocked ops were reported for this OSD
    if [ -n "$blocked_time" ] && [ "$blocked_time" -gt "$BLOCKED_OPS_RESTART_THRESH" ]; then
        wlog "$name" "WARN" "Detected blocked operations for $blocked_time seconds"
        echo "true"
    else
        echo "false"
    fi
}

################
#### WRS END ###
################

usage_exit() {
    # Print the command synopsis and the option summary on stdout, then
    # terminate the script.
    echo "usage: $0 [options] {start|stop|restart|condrestart} [mon|osd|mds]..."
    # One argument per output line; %b expands the leading \t of each entry.
    printf '%b\n' \
        "Core options:" \
        "\t--allhosts / -a           execute (via ssh) on all hosts in conf file" \
        "\t--cluster [cluster name]  define the cluster name" \
        "\t--conf / -c [conf file]   use [conf file] instead of default" \
        "\t--help / -h               show this usage message" \
        "\t--hostname [hostname]     override hostname lookup" \
        "\t-m [mon addr]             mon address" \
        "" \
        "Other options:" \
        "\t--btrfs                   btrfs" \
        "\t--nobtrfs                 no btrfs" \
        "\t--btrfsumount             btrfs umount" \
        "\t--fsmount                 fsmount" \
        "\t--nofsmount               no fsmount" \
        "\t--fsumount                fsumount" \
        "\t--restart                 restart on core dump" \
        "\t--norestart               do not restart on core dump" \
        "\t--valgrind                run via valgrind" \
        "\t--novalgrind              do not run via valgrind" \
        "\t--verbose / -v            be verbose"
    exit
}

# behave if we are not completely installed (e.g., Debian "removed,
# config remains" state): without the helper library there is nothing this
# script can do, so leave quietly with a success status.
test -f $LIBEXECDIR/ceph_common.sh || exit 0

# Load the shared helper library. Helpers called below but not defined in
# this file (do_cmd, get_conf, check_host, verify_conf, wlog, ...) presumably
# come from here -- TODO confirm.
. $LIBEXECDIR/ceph_common.sh

# Overall exit code of the script; individual actions overwrite it on failure.
EXIT_STATUS=0

# Send a signal to a daemon once, without waiting for it to exit.
#   $1 - daemon name (used in the progress message)
#   $2 - daemon binary name, grepped for in the command line of the pid
#   $3 - pid file
#   $4 - signal argument handed to 'kill' as-is (e.g. -1)
#   $5 - verb for the progress message; defaults to "Stopping"
# The check-and-kill snippet is handed to do_cmd as one string: expansions
# escaped with a backslash are evaluated when the snippet runs, the others
# when the string is built here.
# NOTE: the variables below are not declared local, so they are assigned in
# the caller's scope.
signal_daemon() {
    name=$1
    daemon=$2
    pidfile=$3
    signal=$4
    action=$5
    [ -z "$action" ] && action="Stopping"
    printf "$action Ceph $name on $host..."
    do_cmd "if [ -e $pidfile ]; then
        pid=\`cat $pidfile\`
        if ps -p \$pid -o args= | grep -q $daemon; then
	    cmd=\"kill $signal \$pid\"
	    printf \"\$cmd...\"
	    \$cmd
        fi
    fi"
    echo done
}

# Check whether the daemon recorded in a pid file is running.
#   $1 - daemon name
#   $2 - daemon binary name, grepped for in the command line of the pid
#   $3 - daemon id; the command line must also match '-i.<id>' as a whole
#        word ('.' being the regex wildcard, so '-i <id>' matches)
#   $4 - pid file
# The snippet handed to do_cmd exits 0 when the pid is alive and its command
# line matches, and 1 when the pid file is missing or the pid belongs to
# something else. The result is whatever do_cmd returns for that snippet;
# the meaning of its extra "" and "okfail" arguments is defined by do_cmd,
# which is not part of this file.
# NOTE: the variables below are not declared local, so they are assigned in
# the caller's scope.
daemon_is_running() {
    name=$1
    daemon=$2
    daemon_id=$3
    pidfile=$4
    do_cmd "[ -e $pidfile ] || exit 1   # no pid, presumably not running
	pid=\`cat $pidfile\`
	ps -p \$pid -o args= | grep $daemon | grep -qwe -i.$daemon_id && exit 0 # running
        exit 1  # pid is something else" "" "okfail"
}

# Signal a daemon repeatedly (once per second) until it is gone or the
# optional timeout expires.
#   $1 - daemon name (used in the progress message)
#   $2 - daemon binary name, grepped for in the command line of the pid
#   $3 - pid file
#   $4 - signal for 'kill'; may be empty (kill's default), already dashed
#        ('-9', '-TERM') or a bare name/number ('TERM', 'KILL', '9'), which
#        gets the leading dash added here
#   $5 - verb for the progress message; defaults to "Stopping"
#   $6 - optional timeout counter; when set, the loop gives up once it has
#        been decremented below zero. Empty means "retry until the process
#        is gone".
# NOTE: the variables below are intentionally not local: the 'stop' action
# of this script reads $pidfile after calling this function.
stop_daemon() {
    name=$1
    daemon=$2
    pidfile=$3
    signal=$4
    action=$5
    timeout=$6
    [ -z "$action" ] && action="Stopping"
    # 'kill TERM <pid>' would treat TERM as a (bogus) pid and fall back to
    # the default signal, so make sure the signal carries its dash.
    case "$signal" in
        ""|-*) ;;
        *) signal="-$signal" ;;
    esac
    printf "$action Ceph $name on $host..."
    # Backslash-escaped expansions are evaluated when do_cmd runs the
    # snippet; the others are substituted while the string is built here.
    do_cmd "if [ -e $pidfile ] ; then
        pid=\`cat $pidfile\`
        timeout=$timeout
        while ps -p \$pid -o args= | grep -q $daemon; do
            if [ -n \"\$timeout\" ]; then
                if [ \$timeout -lt 0 ]; then
                    break
                fi
                timeout=\$((timeout - 1))
            fi
            cmd=\"kill $signal \$pid\"
            printf \"\$cmd...\"
            \$cmd
            sleep 1
        done
    fi"
    echo done
}

## command line options
# 'options' accumulates every option (and option argument) consumed below so
# that the restart/condrestart actions can re-invoke this script with them.
options=

# Normalise the command line with getopt(1); abort when it reports an error.
OPTS=$(getopt -n 'init-ceph' -o 'hvam:c:' -l 'help,verbose,valgrind,novalgrind,allhosts,restart,norestart,btrfs,nobtrfs,fsmount,nofsmount,btrfsumount,fsumount,conf:,cluster:,hostname:' -- "$@")
if [ $? != 0 ]
then
    exit 1
fi

# Replace the positional parameters with getopt's normalised output.
eval set -- "$OPTS"

# Defaults; an empty dovalgrind/docrun means "not given on the command line".
# NOTE(review): monaddr is initialised here, but the -m handler below stores
# its argument in MON_ADDR.
dovalgrind=
docrun=
allhosts=0
monaddr=
dofsmount=1
dofsumount=0
verbose=0
use_default_conf=1

## set variables like cluster or conf
[ -e /etc/sysconfig/ceph ] && . /etc/sysconfig/ceph
[ -e /etc/default/ceph ] && . /etc/default/ceph


# Consume leading arguments that start with '-'. Options taking a value
# append the option to $options and shift inside their arm; the value itself
# is then appended and shifted by the two statements that follow 'esac'.
while echo $1 | grep -q '^-'; do     # FIXME: why not '^-'?
case $1 in
    -v | --verbose)
	    verbose=1
	    ;;
    --valgrind)
	    dovalgrind=1
	    ;;
    --novalgrind)
	    dovalgrind=0
	    ;;
    --allhosts | -a)
	    allhosts=1;
	    ;;
    --restart)
	    docrun=1
	    ;;
    --norestart)
	    docrun=0
	    ;;
    -h | --help)
            usage_exit
            ;;
    -m )
	    [ -z "$2" ] && usage_exit
	    options="$options $1"
	    shift
	    MON_ADDR=$1
	    ;;
    --btrfs | --fsmount)
	    dofsmount=1
	    ;;
    --nobtrfs | --nofsmount)
	    dofsmount=0
	    ;;
    --btrfsumount | --fsumount)
	    dofsumount=1
	    ;;
    --conf | -c)
	    [ -z "$2" ] && usage_exit
	    options="$options $1"
	    shift
        use_default_conf=0
	    conf=$1
	    ;;
    --cluster )
	    [ -z "$2" ] && usage_exit
	    options="$options $1"
	    shift
	    cluster=$1
	    ;;
    --hostname )
	    [ -z "$2" ] && usage_exit
	    options="$options $1"
	    shift
	    hostname=$1
            ;;
    -- )
            # end of options: drop the marker and stop parsing
            shift
            break
            ;;
    *)
	    echo unrecognized option \'$1\'
	    usage_exit
	    ;;
esac
options="$options $1"
shift
done

# Settle on the cluster name and the configuration file:
#  - without `--cluster`, the cluster name is the conf file's base name up to
#    the first dot;
#  - with `--cluster` but without `--conf`, $conf is derived from the cluster
#    name. An explicitly given `--conf` is never silently overridden.
if [ -z "$cluster" ]; then
    cluster=$(echo $conf | awk -F'/' '{print $(NF)}' | cut -d'.' -f 1)
elif [ $use_default_conf -eq 1 ]; then
    conf="/etc/ceph/$cluster.conf"
fi


verify_conf

# The first remaining argument is the action; everything after it names
# the daemons to act on.
command=$1
if [ -n "$*" ]; then
    shift
fi

get_local_name_list
get_name_list "$@"

# When stopping, walk the daemon list back to front
if [ "$command" = "stop" ]; then
    for daemon_entry in $what; do
        new_order="$daemon_entry $new_order"
    done
    what="$new_order"
fi

# ids of the OSDs stopped by this run (filled in by the 'stop' action)
WAIT_OSD_STOP=""

# Main loop: apply $command to every daemon listed in $what.
# Each entry is split into a three-letter type ('mon', 'osd', 'mds') and an
# id; per-daemon settings are then looked up with get_conf before the action
# is dispatched in the 'case' at the bottom of the loop body.
for name in $what; do
    type=`echo $name | cut -c 1-3`   # e.g. 'mon', if $item is 'mon1'
    id=`echo $name | cut -c 4- | sed 's/^\\.//'`
    num=$id
    name="$type.$id"

    check_host $cluster || continue

    binary="$BINDIR/ceph-$type"
    cmd="$binary -i $id"
    if [ $ASSUME_DEV -eq 1 ]; then
      cmd="PATH=$PWD:$PATH $cmd"
    fi

    get_conf run_dir "/var/run/ceph" "run dir"

    get_conf pid_file "$run_dir/$type.$id.pid" "pid file"

    if [ "$command" = "start" ]; then
	if [ -n "$pid_file" ]; then
	    do_cmd "mkdir -p "`dirname $pid_file`
	    cmd="$cmd --pid-file $pid_file"
	fi

	get_conf log_dir "" "log dir"
	[ -n "$log_dir" ] && do_cmd "mkdir -p $log_dir"

        get_conf auto_start "" "auto start"
        if [ "$auto_start" = "no" ] || [ "$auto_start" = "false" ] || [ "$auto_start" = "0" ]; then
            # Skip only when no daemon was named explicitly. "$*" is a single
            # word, so the test stays valid with several arguments.
            if [ -z "$*" ]; then
                echo "Skipping Ceph $name on $host... auto start is disabled"
                continue
            fi
        fi

	if daemon_is_running $name ceph-$type $id $pid_file; then
	    echo "Starting Ceph $name on $host...already running"
	    continue
	fi

	get_conf copy_executable_to "" "copy executable to"
	if [ -n "$copy_executable_to" ]; then
	    scp $binary "$host:$copy_executable_to"
	    binary="$copy_executable_to"
	fi
    fi

    # conf file
    cmd="$cmd -c $conf"

    if echo $name | grep -q ^osd; then
	get_conf osd_data "/var/lib/ceph/osd/$cluster-$id" "osd data"
	get_conf fs_path "$osd_data" "fs path"  # mount point defaults so osd data
        get_conf fs_devs "" "devs"
	if [ -z "$fs_devs" ]; then
	    # try to fallback to old keys
	    get_conf tmp_btrfs_devs "" "btrfs devs"
	    if [ -n "$tmp_btrfs_devs" ]; then
		fs_devs="$tmp_btrfs_devs"
	    fi
	fi
        first_dev=`echo $fs_devs | cut '-d ' -f 1`
    fi

    # do lockfile, if RH
    get_conf lockfile "/var/lock/subsys/ceph" "lock file"
    lockdir=`dirname $lockfile`
    if [ ! -d "$lockdir" ]; then
	lockfile=""
    fi

    get_conf asok "$run_dir/$cluster-$type.$id.asok" "admin socket"

    case "$command" in
	start)
            # Increase max_open_files, if the configuration calls for it.
            get_conf max_open_files "32768" "max open files"

	        # Remove stale admin socket
	        [ -n "$asok" ] && rm -f $asok

	        # Wait for pending systemd core dumps
	        deadline=$(( $(date '+%s') + 300 ))
	        while [[ $(date '+%s') -lt "${deadline}" ]]; do
	             systemd_coredump_pid=$(pgrep -f "systemd-coredump.*ceph-${type}")
	             [[ -z "${systemd_coredump_pid}" ]] && break
	             wlog $name "INFO" "systemd-coredump ceph-${type} in progress: pid ${systemd_coredump_pid}"
	             sleep 1
	        done

            # build final command
	    wrap=""
	    runmode=""
	    runarg=""

	    [ -z "$docrun" ] && get_conf_bool docrun "0" "restart on core dump"
	    [ "$docrun" -eq 1 ] && wrap="$BINDIR/ceph-run"

	    [ -z "$dovalgrind" ] && get_conf_bool valgrind "" "valgrind"
	    [ -n "$valgrind" ] && wrap="$wrap valgrind $valgrind"

	    [ -n "$wrap" ] && runmode="-f &" && runarg="-f"
	    [ -n "$max_open_files" ] && files="ulimit -n $max_open_files;"

	    [ -n "$TCMALLOC_MAX_TOTAL_THREAD_CACHE_BYTES" ] && tcmalloc="TCMALLOC_MAX_TOTAL_THREAD_CACHE_BYTES=$TCMALLOC_MAX_TOTAL_THREAD_CACHE_BYTES"

	    if [ -n "$SYSTEMD_RUN" ]; then
                time=`date +%s.%N` 
		# TIS: not running as ceph user/group
		cmd="$SYSTEMD_RUN --unit=ceph-$name.$time -r bash -c '$files $tcmalloc $cmd --cluster $cluster -f'"
	    else
		# TIS: not running as ceph user/group
		cmd="$files $tcmalloc $wrap $cmd --cluster $cluster $runmode"
	    fi

	    if [ $dofsmount -eq 1 ] && [ -n "$fs_devs" ]; then
		get_conf pre_mount "true" "pre mount command"
		get_conf fs_type "" "osd mkfs type"

		if [ -z "$fs_type" ]; then
		    # try to fallback to to old keys
		    get_conf tmp_devs "" "btrfs devs"
		    if [ -n "$tmp_devs" ]; then
			fs_type="btrfs"
		    else
		        echo No filesystem type defined!
		        exit 0
                    fi
		fi

		get_conf fs_opt "" "osd mount options $fs_type"
		if [ -z "$fs_opt" ]; then
		    if [ "$fs_type" = "btrfs" ]; then
		        #try to fallback to old keys
			get_conf fs_opt "" "btrfs options"
		    fi

		    if [ -z "$fs_opt" ]; then
			if [ "$fs_type" = "xfs" ]; then
			    fs_opt="rw,noatime,inode64"
			else
		            #fallback to use at least noatime
		            fs_opt="rw,noatime"
			fi
		    fi
		fi

		[ -n "$fs_opt" ] && fs_opt="-o $fs_opt"
		[ -n "$pre_mount" ] && do_cmd "$pre_mount"

		do_root_cmd_okfail "mkdir -p $fs_path"
		if [ "$fs_type" = "btrfs" ]; then
		    echo Mounting Btrfs on $host:$fs_path
		    do_root_cmd_okfail "modprobe btrfs ; btrfs device scan || btrfsctl -a ; egrep -q '^[^ ]+ $fs_path ' /proc/mounts && umount $fs_path ; mount -t btrfs $fs_opt $first_dev $fs_path"
		else
		    echo Mounting $fs_type on $host:$fs_path
		    do_root_cmd_okfail "modprobe $fs_type ; egrep -q '^[^ ]+ $fs_path ' /proc/mounts && umount $fs_path ; mount -t $fs_type $fs_opt $first_dev $fs_path"
		fi
		if [ "$ERR" != "0" ]; then
		    EXIT_STATUS=$ERR
		    continue
		fi
	    fi

	    if [ "$type" = "osd" ]; then
		get_conf update_crush "" "osd crush update on start"
		case "${update_crush:-1}" in 1|[Tt][Rr][Uu][Ee])
		    # update location in crush
		    get_conf osd_location_hook "$BINDIR/ceph-crush-location" "osd crush location hook"
		    osd_location=`$osd_location_hook --cluster $cluster --id $id --type osd`
		    get_conf osd_weight "" "osd crush initial weight"
		    defaultweight="$(df -P -k $osd_data/. | tail -1 | awk '{ print sprintf("%.4f",$2/1073741824) }')"
		    get_conf osd_keyring "$osd_data/keyring" "keyring"
		    do_cmd_okfail "timeout 30 $BINDIR/ceph -c $conf --name=osd.$id --keyring=$osd_keyring osd crush create-or-move -- $id ${osd_weight:-${defaultweight:-1}} $osd_location"
		    if [ "$ERR" != "0" ]; then
			EXIT_STATUS=$ERR
			continue
		    fi
                    do_cmd_okfail "timeout 30 $BINDIR/ceph -c $conf --name=osd.$id --keyring=$osd_keyring osd crush reweight -- osd.$id ${osd_weight:-${defaultweight:-1}}"
                    if [ "$ERR" != "0" ]; then
                        EXIT_STATUS=$ERR
                        continue
                    fi
		esac
	    fi

	    echo Starting Ceph $name on $host...
	    if [ ! -d $run_dir ]; then
		# assume /var/run exists
		install -d -m0770 -o ceph -g ceph /var/run/ceph
	    fi
	    get_conf pre_start_eval "" "pre start eval"
	    [ -n "$pre_start_eval" ] && $pre_start_eval
	    get_conf pre_start "" "pre start command"
	    get_conf post_start "" "post start command"
	    [ -n "$pre_start" ] && do_cmd "$pre_start"
	    do_cmd_okfail "$cmd" $runarg
	    if [ "$ERR" != "0" ]; then
		EXIT_STATUS=$ERR
		continue
	    fi

            # the daemon was launched: reset its hang-detection records
            save_proc_startup_ok $name 

	    if [ "$type" = "mon" ]; then
		# this will only work if we are using default paths
		# for the mon data and admin socket.  if so, run
		# ceph-create-keys.  this is the case for (normal)
		# chef and ceph-deploy clusters, which is who needs
		# these keys.  it's also true for legacy installs
		# via mkcephfs, which is fine too; there is no harm
		# in creating these keys.
		get_conf mon_data "/var/lib/ceph/mon/$cluster-$id" "mon data"
		if [ "$mon_data" = "/var/lib/ceph/mon/$cluster-$id" -a "$asok" = "/var/run/ceph/$cluster-mon.$id.asok" ]; then
		    echo Starting ceph-create-keys on $host...
		    cmd2="$SBINDIR/ceph-create-keys --cluster $cluster -i $id 2> /dev/null &"
		    do_cmd "$cmd2"
		fi
	    fi

	    [ -n "$post_start" ] && do_cmd "$post_start"
	    # NOTE(review): "$?" below is the status of the preceding
	    # [ -n "$lockfile" ] test, so it is always 0 when evaluated.
	    [ -n "$lockfile" ] && [ "$?" -eq 0 ] && touch $lockfile
	    ;;

	stop)
	    get_conf pre_stop "" "pre stop command"
	    get_conf post_stop "" "post stop command"
	    [ -n "$pre_stop" ] && do_cmd "$pre_stop"

            wlog $name "INFO" "Stopping process"

            if [ $(load_proc_state $name) != "$ST_HANGED" ]; then
                stop_daemon $name ceph-$type $pid_file
            else
                # first try to gracefully close process, this should be fast if
                # its threads still respond to the TERM signal
                # (signals are given with their leading dash, as kill expects)
                wlog $name "DEBUG" ">>> Sending term signal"
                stop_daemon $name ceph-$type $pid_file -TERM "" 5
                wlog $name "DEBUG" ">>> Sending kill signal"
                # then just kill it
                stop_daemon $name ceph-$type $pid_file -KILL
            fi
	    # remove the pid file configured for this daemon
	    [ -n "$pid_file" ] && rm -f $pid_file
	    [ -n "$asok" ] && rm -f $asok
	    [ -n "$post_stop" ] && do_cmd "$post_stop"
	    # NOTE(review): "$?" below is the status of the preceding
	    # [ -n "$lockfile" ] test, so it is always 0 when evaluated.
	    [ -n "$lockfile" ] && [ "$?" -eq 0 ] && rm -f $lockfile
            # flush journal to data disk in background
            if [ "$type" = "osd" ];then
                WAIT_OSD_STOP="${WAIT_OSD_STOP} $id"
                # run the flush itself in the background; its stdout is
                # discarded rather than being executed as a command
                /usr/bin/ceph-osd -i $id --flush-journal >/dev/null &
            fi

            wlog $name "INFO" "Process stopped, setting state to $ST_STOPPED"
            save_proc_state $name $ST_STOPPED

	    if [ $dofsumount -eq 1 ] && [ -n "$fs_devs" ]; then
		echo Unmounting OSD volume on $host:$fs_path
		do_root_cmd "umount $fs_path || true"
	    fi
	    ;;

	status)
	    if daemon_is_running $name ceph-$type $id $pid_file; then
                # ceph processes answer in around 100ms when the process works correctly
                do_cmd "timeout 1 $BINDIR/ceph --admin-daemon $asok version 2>/dev/null || echo unknown"

                # check if daemon is hung
                is_hung=$(is_process_hung $name $type)
                if [ "$is_hung" = "true" ]; then
                    echo "$name: hung."
                    # based on http://refspecs.linuxbase.org/LSB_3.1.0/LSB-Core-generic/LSB-Core-generic/iniscrptact.html
                    # exit codes from 150 to 199 are application specific, therefore we define one here
                    EXIT_STATUS=150
                else
                    # Wait a period of time prior to OSD start before restarting based on slow/blocked requests
                    if [ "$type" = "osd" ] && [ $BLOCKED_OPS_DETECTION_ENABLED = "true" ]; then
                        up_time=$(get_proc_run_time $name)
                        if [ $up_time -gt $BLOCKED_OPS_START_DETECTION ]; then
                            has_blocked_ops=$(osd_has_blocked_ops $name)
                            if [ "$has_blocked_ops" = "true" ]; then
                                echo "$name: blocked ops."
                                # based on http://refspecs.linuxbase.org/LSB_3.1.0/LSB-Core-generic/LSB-Core-generic/iniscrptact.html
                                # exit codes from 150 to 199 are application specific, therefore we define one here
                                EXIT_STATUS=151
                            else
                                echo "$name: running."
                            fi
                        else
                            echo "$name: running."
                        fi
                    else
                        echo "$name: running."
                    fi
                fi
            elif [ -e "$pid_file" ]; then
                # daemon is dead, but pid file still exists
                echo "$name: dead."
                EXIT_STATUS=1
            else
                # daemon is dead, and pid file is gone
                echo "$name: not running."
                EXIT_STATUS=3
            fi
	    ;;

	ssh)
	    $ssh
	    ;;

	forcestop)
	    get_conf pre_forcestop "" "pre forcestop command"
	    get_conf post_forcestop "" "post forcestop command"
	    [ -n "$pre_forcestop" ] && do_cmd "$pre_forcestop"
	    stop_daemon $name ceph-$type $pid_file -9
	    [ -n "$post_forcestop" ] && do_cmd "$post_forcestop"
	    [ -n "$lockfile" ] && [ "$?" -eq 0 ] && rm -f $lockfile
	    ;;

	killall)
	    echo "killall ceph-$type on $host"
	    do_cmd "pkill ^ceph-$type || true"
	    [ -n "$lockfile" ] && [ "$?" -eq 0 ] && rm -f $lockfile
	    ;;

	force-reload | reload)
	    signal_daemon $name ceph-$type $pid_file -1 "Reloading"
	    ;;

	restart)
	    # re-run this script with the options collected during parsing
	    $0 $options stop $name
	    $0 $options start $name
	    ;;

        condrestart)
            if daemon_is_running $name ceph-$type $id $pid_file; then
                $0 $options stop $name
                $0 $options start $name
            else
                echo "$name: not running."
            fi
            ;;

	cleanlogs)
	    echo removing logs
	    [ -n "$log_dir" ] && do_cmd "rm -f $log_dir/$type.$id.*"
	    ;;

	cleanalllogs)
	    echo removing all logs
	    [ -n "$log_dir" ] && do_cmd "rm -f $log_dir/* || true"
	    ;;

	*)
	    usage_exit
	    ;;
    esac
done

# wait for journal flushing to complete: first for the background jobs
# started by the 'stop' action, then for the stopped OSDs to leave 'up'.
# "$command" is quoted (and compared with '=') so an empty command does not
# turn the test into a syntax error.
if [ "$command" = "stop" ]; then
    wait
    # WAIT_OSD_STOP is deliberately unquoted: it is a space-separated id list
    osd-wait-status -o ${WAIT_OSD_STOP} -n -s up -i 3 -d 300
fi

# activate latent osds? Only for an installed (non ./) layout, and only when
# no daemon was named or the single word 'osd' was given.
if [ "$command" = "start" ] && [ "$BINDIR" != "." ]; then
    if [ "$*" = "" ] || echo $* | grep -q ^osd\$ ; then
       if [ -x $SBINDIR/ceph-disk ]; then
           ceph-disk activate-all
       fi
    fi
fi

exit $EXIT_STATUS
